import pandas as pd
import numpy as np
import numerapi
import os
import plotly.express as px
import plotly.graph_objects as go
import catboost
#pca from sklearn
from sklearn.decomposition import PCA
# Load the training split first, then the validation split, from local parquet files.
training_set, validation_set = map(
    pd.read_parquet,
    (
        "data/numerai_training_data.parquet",
        "data/numerai_validation_data.parquet",
    ),
)
# Every training column whose name contains "feature_" is used as a model input.
feature_names = [column for column in training_set.columns if "feature_" in column]
# Train a CatBoost model on all features,
# then train on several different numbers of PCA components,
# and test each for era-wise correlation on the validation set.
# PCA settings to evaluate: None stands for "no PCA" (raw features); the other
# entries are the feature count integer-divided by 1, 2, 4 and 8.
N_pca_features = [None] + [len(feature_names) // divisor for divisor in (1, 2, 4, 8)]
# Recorded notebook output for this cell: [None, 1050, 525, 262, 131]
# Keyword arguments passed unchanged to catboost.CatBoostRegressor for every run.
params = dict(
    iterations=1000,
    learning_rate=0.01,
    depth=6,
    task_type="GPU",
    verbose=False,
)
# Train one CatBoost model per entry of N_pca_features and collect the mean
# era-wise validation correlation of each, in the same order as N_pca_features.
mean_corrs = []
for item in N_pca_features:
    # A new model instance is created for every setting.
    model = catboost.CatBoostRegressor(**params)
    if item is None:
        # Baseline: fit directly on the raw feature columns (no PCA).
        model.fit(training_set[feature_names], training_set["target"])
        validation_set["validation_prediction"] = model.predict(validation_set[feature_names])
    else:
        # Fit PCA on the training features only, then project both splits with
        # that same fitted transform before training and predicting.
        pca = PCA(n_components=item)
        pca.fit(training_set[feature_names])
        training_set_pca = pca.transform(training_set[feature_names])
        validation_set_pca = pca.transform(validation_set[feature_names])
        model.fit(training_set_pca, training_set["target"])
        validation_set["validation_prediction"] = model.predict(validation_set_pca)
    # Correlation (np.corrcoef) between prediction and target within each era.
    # Only the two columns the lambda reads are selected after grouping: this
    # avoids handing every feature column to each per-era call and does not
    # rely on apply() operating on the grouping column, which newer pandas
    # versions deprecate.
    era_wise_correlations = validation_set.groupby("era")[
        ["validation_prediction", "target"]
    ].apply(
        lambda era: np.corrcoef(era["validation_prediction"], era["target"])[0, 1]
    )
    mean_corr = era_wise_correlations.mean()
    print(f"{item} PCA components: {mean_corr}")
    mean_corrs.append(mean_corr)
# Recorded output of the loop above:
# None PCA components: 0.022954946780619122
# 1050 PCA components: 0.012031665005416482
# 525 PCA components: 0.01199597158943061
# 262 PCA components: 0.012240655156281562
# 131 PCA components: 0.01108041763419352
# Plot the mean era-wise correlation obtained for each PCA setting.
import plotly.express as px

# Build exactly one x-axis label per entry of N_pca_features so the labels stay
# aligned with mean_corrs (which has one value per entry, including the None
# baseline). The None entry is the model trained on the raw features.
# A separate list is used so N_pca_features keeps its numeric values and the
# training loop can be re-run afterwards.
pca_labels = [
    f"No PCA ({len(feature_names)} features)"
    if item is None
    else f"{item} PCA Components"
    for item in N_pca_features
]
# X axis: the PCA setting; Y axis: the mean correlation for that setting.
fig = px.bar(x=pca_labels, y=mean_corrs)
fig.update_layout(
    title_text="Mean Correlation of Validation Set Predictions",
    xaxis_title="Number of PCA Components",
    yaxis_title="Mean Correlation",
)
fig.show(renderer="notebook")